Cluster Analyses and Text Analyses

Awesome things:

0. Load Packages

# General use ----
library(tidyverse)
library(janitor)
library(plotly)      # less popular since increase with shiny, but still useful!
library(factoextra)  # used before with biplots for pca
library(RColorBrewer)

# Cluster analyses ----
library(NbClust)
library(cluster)
library(dendextend)
library(ggdendro)

# Text analyses ----
library(pdftools)
library(tidytext)
library(wordcloud)

1. k-means clustering

A. check it out

# Tidy the column headings (e.g. Petal.Length -> petal_length)
# using janitor::clean_names()
iris_nice <- clean_names(iris)

# First look at petal dimensions, colored by species
ggplot(data = iris_nice, aes(x = petal_length, y = petal_width)) +
  geom_point(aes(color = species))

# The grouping looks obvious, but ask R anyway: "how many clusters do
# you THINK there should be?" via NbClust::NbClust.
# - data = the four numeric measurement columns ([1:4])
# - consider between 2 and 10 clusters, evaluated with k-means
number_est <- NbClust(data = iris_nice[1:4],
                      min.nc = 2,
                      max.nc = 10,
                      method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 10 proposed 2 as the best number of clusters 
## * 8 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
# Conclusion: "the best number of clusters is 2"
# But we'll stick with 3 clusters because of what we know about the data

B. Perform k-means, Explore outcome

# specify [columns] and number of clusters (3)
# kmeans() picks random starting centers, so set a seed first to make
# the cluster assignments (and their arbitrary labels) reproducible
set.seed(10)
iris_km <- kmeans(iris_nice[1:4], 3)

# how many observations per cluster? 
iris_km$size
## [1] 62 38 50
# for each variable, what is the multivariate center location in 4D space? 
iris_km$centers
##   sepal_length sepal_width petal_length petal_width
## 1     5.901613    2.748387     4.393548    1.433871
## 2     6.850000    3.073684     5.742105    2.071053
## 3     5.006000    3.428000     1.462000    0.246000
# what cluster has each observation been assigned to?
# (the numbers 1/2/3 are arbitrary labels -- only membership matters)
iris_km$cluster
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
# take these cluster assignments, and then put this information in a df with the original data. wow!
# cluster_no is wrapped in factor() so ggplot treats it as discrete
iris_cl <- data.frame(iris_nice, cluster_no = factor(iris_km$cluster))
# View() opens the spreadsheet viewer -- interactive sessions only
View(iris_cl)

# How did the clusters land in the sepal dimensions
# (2 of the 4 dimensions used for clustering)?
ggplot(data = iris_cl, aes(x = sepal_length, y = sepal_width)) +
  geom_point(aes(color = cluster_no))

# Petal dimensions, with point shape mapped to species so we can see
# where k-means clusters disagree with the true species labels
ggplot(data = iris_cl, aes(x = petal_length, y = petal_width)) +
  geom_point(aes(color = cluster_no, shape = species)) +
  scale_color_brewer(palette = "Dark2") +
  theme_light()

# but if we really want a 3D representation, use plotly! Note the
# formula (`~column`) syntax, used consistently once data = iris_cl is set.
# plotly strengths: you can also create your own interactive widgets, and reactive output is maintained in an html file output (knitted)!!
plot_ly(data = iris_cl,
        x = ~petal_length,
        y = ~petal_width,
        z = ~sepal_width,
        type = "scatter3d",
        mode = "markers",  # set explicitly; avoids the "No scatter3d mode specifed" message
        color = ~cluster_no,
        symbol = ~species,
        marker = list(size = 3),
        colors = "Set1")

2. Hierarchical cluster analysis

Data: >150 countries, with World Bank environmental indicators (electricity access, agricultural land, CO2, methane, and total greenhouse gas emissions).

Note: Very different orders of magnitude of some of these variables. It’s useful, then, to scale data using the scale() function.

A. Wrangle

#remember this syntax for working within R projects
#(read_csv resolves the path relative to the project root, and prints
# the column specification it guessed -- check it matches expectations)
wb_env <- read_csv("wb_env.csv")
## Parsed with column specification:
## cols(
##   name = col_character(),
##   region = col_character(),
##   electricity = col_double(),
##   agland = col_double(),
##   co2 = col_double(),
##   methane = col_double(),
##   ghg = col_double()
## )
# Only keep the top 20 greenhouse-gas (ghg) emitting countries.
# desc(ghg) is the idiomatic, explicit form of the "-ghg" shorthand.
wb_ghg_20 <- wb_env %>% 
  arrange(desc(ghg)) %>% 
  head(20)

# Scale the numeric columns (3:7) so variables with very different
# orders of magnitude contribute comparably to the distance metric.
# scale() returns a matrix, so coerce back to a data frame.
wb_scaled <- as.data.frame(scale(wb_ghg_20[3:7]))
# Use country names as rownames so they label the dendrogram leaves
rownames(wb_scaled) <- wb_ghg_20$name

B. Calculate euclidean distances and do some agglomerative and divisive clustering. Results = dendrograms.

# NOW, compute dissimilarity values (Euclidean distances)!
# from stats::dist -- returns a pairwise distance ("dist") object
diss <- dist(wb_scaled, method = "euclidean")

# Hierarchical agglomerative clustering (complete linkage)!
# this will create a dendrogram
# feed it the dissimilarity matrix (diss)
# complete is the default method, but write it anyways
hc_complete <- hclust(diss, method = "complete")

# Plot it (base plot):
# hang = -1 aligns all leaf labels at the same baseline
plot(hc_complete, cex = 0.6, hang = -1)

# Now cluster these divisively - and see some differences in output.
# diana() is from the cluster package; plot() shows banner + dendrogram
hc_div <- diana(diss)
plot(hc_div)

C. Compare these results, since they differ slightly.

# Convert to class dendrogram (the class dendextend functions expect)
dend1 <- as.dendrogram(hc_complete)
dend2 <- as.dendrogram(hc_div)

# Combine into list
# NOTE(review): dend_list is not used below -- tanglegram() is called on
# dend1/dend2 directly; it could equally be given this dendlist.
dend_list <- dendlist(dend1,dend2)

#a tanglegram will show differences between dendrograms
# if the methods had produced the same results, we would see straight lines across, here
tanglegram(dend1, dend2)

# Convert to class 'dendro' for ggplotting (ggdendro::dendro_data)
data1 <- dendro_data(hc_complete)

# Quick ggplot-flavored dendrogram via ggdendro::ggdendrogram();
# it returns a ggplot object, so the usual + layers apply
ggdendrogram(hc_complete, rotate = TRUE) +
  theme_minimal() +
  labs(x = "Country")

# Want to do it actually in ggplot? Here:
# Build label positions: keep the vertical terminal segments
# (x == xend) at integer x positions (the leaf slots), then bind the
# leaf labels onto those rows.
label_data <- bind_cols(filter(segment(data1), x == xend & x%%1 == 0), label(data1))

ggplot() +
  geom_segment(data = segment(data1),
               aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_text(data = label_data,
            aes(x = xend, y = yend, label = label),
            hjust = 0,  # a constant, so it belongs outside aes()
            size = 2) +
  coord_flip() +
  scale_y_reverse(expand = c(0.2, 0)) +
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none")  # must be lowercase "none", not "None"

3. Intro to Text Analysis

pdftools, stringr, tidytext

A. Extract and analyze information from a pdf, using Greta Thunberg’s speech from COP24

#note new syntax in specifying file path
greta <- file.path("greta_thunberg.pdf")
thunberg_text <- pdf_text(greta)

# Bring in as a data frame: each pdf page arrives as one long string,
# so split on "\n" to get one row per line of the pdf.
# stringsAsFactors = FALSE keeps `text` as character (str_split needs a
# character vector); on R < 4.0 data.frame() defaulted to factors.
thunberg_df <- data.frame(text = thunberg_text, stringsAsFactors = FALSE) %>% 
  mutate(text_full = str_split(text, '\\n')) %>% 
  unnest(text_full)

# Keep only the speech itself: rows 4-18 (drops the pdf header lines)
speech_text <- thunberg_df %>% 
  slice(4:18) %>% 
  select(text_full)

# Tokenize: one row per word (unnest_tokens lowercases by default)
sep_words <- speech_text %>% 
  unnest_tokens(word, text_full)

# Frequency table, most common words first
word_count <- sep_words %>% 
  count(word, sort = TRUE)

#...but common terms like pronouns that we don't want to analyze are here!
#HAHA! R has built-in lexicons to help us sort these out
#see stop_words documentation

# Remove the stop words (tidytext's built-in stop_words lexicon).
# Spelling out the join key also silences the 'Joining, by = "word"'
# message that anti_join prints when left to guess.
words_stop <- sep_words %>% 
  anti_join(stop_words, by = "word")

# And we can count them
word_count <- words_stop %>% 
  count(word, sort = TRUE) # Count words and arrange
word_count
## # A tibble: 65 x 2
##    word           n
##    <chr>      <int>
##  1 children       3
##  2 people         3
##  3 speak          3
##  4 care           2
##  5 climate        2
##  6 justice        2
##  7 matter         2
##  8 sacrificed     2
##  9 sweden         2
## 10 15             1
## # ... with 55 more rows

B. More stuff. Example sentiment values from lexicon:

# Example: strongly positive words (scores 4 or 5) from the AFINN lexicon.
# %in% is clearer than chained == comparisons joined with |.
# NOTE(review): newer tidytext releases renamed this column from
# `score` to `value` -- if this filter errors, swap the name.
pos_words <- get_sentiments("afinn") %>% 
  filter(score %in% c(4, 5)) %>% 
  head(20)
pos_words
## # A tibble: 20 x 2
##    word         score
##    <chr>        <int>
##  1 amazing          4
##  2 awesome          4
##  3 breathtaking     5
##  4 brilliant        4
##  5 ecstatic         4
##  6 euphoric         4
##  7 exuberant        4
##  8 fabulous         4
##  9 fantastic        4
## 10 fun              4
## 11 funnier          4
## 12 funny            4
## 13 godsend          4
## 14 heavenly         4
## 15 hurrah           5
## 16 lifesaver        4
## 17 lmao             4
## 18 lmfao            4
## 19 masterpiece      4
## 20 masterpieces     4
# Near-neutral words: AFINN scores from -1 to 1, inclusive
neutral_words <- get_sentiments("afinn") %>% 
  filter(score >= -1, score <= 1) %>% 
  head(20)
neutral_words
## # A tibble: 20 x 2
##    word       score
##    <chr>      <int>
##  1 aboard         1
##  2 absentee      -1
##  3 absentees     -1
##  4 absorbed       1
##  5 accept         1
##  6 accepted       1
##  7 accepting      1
##  8 accepts        1
##  9 achievable     1
## 10 active         1
## 11 adequate       1
## 12 admit         -1
## 13 admits        -1
## 14 admitted      -1
## 15 adopt          1
## 16 adopts         1
## 17 advanced       1
## 18 affected      -1
## 19 afflicted     -1
## 20 affronted     -1

C. More stuff. Bind some lexicon information to our actual speech words (non stop words)

an aside: when in doubt, use full join, keep everything!

# Attach AFINN sentiment scores to the speech words. inner_join keeps
# only words present in the lexicon (so unmatched words like "greta"
# drop out). Explicit by = "word" avoids the join message.
sent_afinn <- words_stop %>% 
  inner_join(get_sentiments("afinn"), by = "word")
sent_afinn
##             word score
## 1        justice     2
## 2         matter     1
## 3         matter     1
## 4  uncomfortable    -2
## 5         growth     2
## 6         scared    -2
## 7            bad    -3
## 8           mess    -2
## 9      emergency    -2
## 10        mature     2
## 11        burden    -2
## 12         leave    -1
## 13          care     2
## 14       popular     3
## 15          care     2
## 16       justice     2
## 17   opportunity     2
## 18          rich     2
## 19           pay    -1
## 20     celebrate     3
#removed "greta" because no matches, via inner join, with the lexicon

# Attach NRC sentiment/emotion categories. A single word can carry
# several categories (e.g. "bad" -> anger, disgust, fear, ...), so the
# join intentionally produces multiple rows per word.
sent_nrc <- words_stop %>% 
  inner_join(get_sentiments("nrc"), by = "word")
sent_nrc
##             word    sentiment
## 1        justice     positive
## 2        justice        trust
## 3         school        trust
## 4  uncomfortable     negative
## 5          green          joy
## 6          green     positive
## 7          green        trust
## 8         growth     positive
## 9      unpopular      disgust
## 10     unpopular     negative
## 11     unpopular      sadness
## 12          talk     positive
## 13       forward     positive
## 14           bad        anger
## 15           bad      disgust
## 16           bad         fear
## 17           bad     negative
## 18           bad      sadness
## 19          mess      disgust
## 20          mess     negative
## 21          pull     positive
## 22     emergency         fear
## 23     emergency     negative
## 24     emergency      sadness
## 25     emergency     surprise
## 26         leave     negative
## 27         leave      sadness
## 28         leave     surprise
## 29       justice     positive
## 30       justice        trust
## 31  civilization     positive
## 32  civilization        trust
## 33   opportunity anticipation
## 34   opportunity     positive
## 35      continue anticipation
## 36      continue     positive
## 37      continue        trust
## 38         money        anger
## 39         money anticipation
## 40         money          joy
## 41         money     positive
## 42         money     surprise
## 43         money        trust
## 44        luxury          joy
## 45        luxury     positive
## 46           pay anticipation
## 47           pay          joy
## 48           pay     positive
## 49           pay        trust
## 50      birthday anticipation
## 51      birthday          joy
## 52      birthday     positive
## 53      birthday     surprise
# How many speech words landed in each NRC sentiment bin?
# count(sentiment) is shorthand for group_by(sentiment) %>% tally()
nrc_count <- sent_nrc %>% 
  count(sentiment)
nrc_count
## # A tibble: 10 x 2
##    sentiment        n
##    <chr>        <int>
##  1 anger            2
##  2 anticipation     5
##  3 disgust          3
##  4 fear             2
##  5 joy              5
##  6 negative         6
##  7 positive        14
##  8 sadness          4
##  9 surprise         4
## 10 trust            8

D. What can we do with these results?

See key for more analyses. But here, let’s build a Word Cloud!

# Word cloud of the non-stop speech words. wordcloud() places words
# with some randomness, so set a seed first for a reproducible layout.
set.seed(5)
wordcloud(word_count$word, 
          freq = word_count$n, 
          min.freq = 1, 
          max.words = 65, 
          scale = c(2, 0.1),
          colors = brewer.pal(3, "Dark2"))